Setup

setwd("~/sc_covid_PiB2023/data_science_project/Data") # path to data folder
dat <- read_h5ad("Pilot_2_rule_them_ALL.h5ad")
sc <- import('scanpy', convert = FALSE) # import scanpy as python module (by reticulate)
scvi <- import('scvi', convert = FALSE) # import scVI as python module (by reticulate)
scvi$settings$progress_bar_style = 'tqdm' # Set up a loading bar when running scVI

Selecting variable features

In cases where the data consists of fewer cells (5694) than genes (23382) scVI is expected to underfit the data, potentially leading to worse imputation accuracy. For scVI anywhere from 1.000 to 10.000 highly variable genes are recommended, but it will be context dependent. For now we choose 2000 HVGs.

To find HGVs we use the FindVariableFeatues function from the Seaurat package.

se_dat <- CreateSeuratObject(counts = t(dat$X)) # create Seurat object using expression matrix 
nbf = 3000 # number of variable features we wish to select 
se_dat <- FindVariableFeatures(se_dat, selection.method = "vst", nfeatures = nbf) # find number of variable features

Plot variable features:

top10 <- head(VariableFeatures(se_dat), 10) # We select 10 highest variable features 
topnbf <- head(VariableFeatures(se_dat), nbf) # select variable features (gene names) to use in scVI

p1 <- VariableFeaturePlot(se_dat) # Create feature plot
p2 <- LabelPoints(plot = p1, points = top10, repel = TRUE, xnudge = 0, ynudge = 0) # Overlay names of top 10 variable features

p2 # show plot
## Warning: Transformation introduced infinite values in continuous x-axis

sub_feature_dat <- dat[,topnbf] # Subset anndata object by selected variable features

Setup for training

Create and train model.

It should be noted that scvi models require the raw counts, but will run for non-negative real-valued data. Possible non-count values should be intended to represent pseudocounts, and not other normalized data, in which variance/covariance structure of the data has changes dramatically. Here we use normalized data.

sub_feature_dat = sub_feature_dat$copy()

# run setup_anndata
scvi$model$SCVI$setup_anndata(sub_feature_dat)
## None
# create model
model = scvi$model$SCVI(sub_feature_dat) 
# train model
model$train()

# to specify the number of epochs when training:
# model$train(max_epochs = as.integer(400))

Returns 10 dimensional latent space.

Visualize latent representation

Retrieve latent representation and visualize using UMAP and tSNE

latent = model$get_latent_representation()
latent <- data.matrix(latent)

The scVI latent space is saved onto the anndata object

dat$obsm$X_scVI <- latent # scVI latent space 
write_h5ad(dat, '../Data/Pilot_2_rule_them_ALL.h5ad')
latent_UMAP <- RunUMAP(dat$obsm$X_scVI, n.components = 2, min.dist = 0.5, n.neighbors = 70) # run UMAP
## 20:35:16 UMAP embedding parameters a = 0.583 b = 1.334
## 20:35:16 Read 5694 rows and found 10 numeric columns
## 20:35:16 Using Annoy for neighbor search, n_neighbors = 70
## 20:35:16 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 20:35:16 Writing NN index file to temp file /scratch/18941268/Rtmpcdm90W/filee13c40c4912d
## 20:35:16 Searching Annoy index using 1 thread, search_k = 7000
## 20:35:20 Annoy recall = 100%
## 20:35:20 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 70
## 20:35:21 Initializing from normalized Laplacian + noise (using irlba)
## 20:35:21 Commencing optimization for 500 epochs, with 475120 positive edges
## 20:35:41 Optimization finished
latent_tSNE <- RunTSNE(dat$obsm$X_scVI, check_duplicates = TRUE, perplexity = 60, max_iter = 2000, theta = 0.2) # run tSNE
latent_df <- as.data.frame(dat$obs) # add annotations to data frame

latent_df$UMAP_1 <- latent_UMAP@cell.embeddings[,1] # add UMAP coordinates
latent_df$UMAP_2 <- latent_UMAP@cell.embeddings[,2]

latent_df$tSNE_1 <- latent_tSNE@cell.embeddings[,1] # add tSNE coordinates
latent_df$tSNE_2 <- latent_tSNE@cell.embeddings[,2]

UMAP visualizations

cellType

p_UMAP_celltype <- latent_df %>%
  arrange(viralLoad) %>% 
  ggplot(aes(x = UMAP_1, y = UMAP_2, color = cellType)) + 
  geom_point(alpha = 0.7) +
  NULL

p_UMAP_celltype

Viral Load

latent_df %>%
  arrange(viralLoad) %>% 
  ggplot(aes(x = UMAP_1, y = UMAP_2, color = viralLoad)) + 
  geom_point(alpha = 0.7) +
  scale_color_gradient(low="blue", high="red") + 
  NULL

tSNE visualizations

Celltype

latent_df %>%
  arrange(viralLoad) %>% 
  ggplot(aes(x = tSNE_1, y = tSNE_2, color = cellType)) + 
  geom_point(alpha = 0.7) +
  NULL

latent_df %>%
  arrange(viralLoad) %>% 
  ggplot(aes(x = tSNE_1, y = tSNE_2, color = viralLoad)) + 
  geom_point(alpha = 0.7) +
  scale_color_gradient(low="blue", high="red") + 
  NULL

Batch correction

When setting up the model it is possible to register batches and the subsequent model will correct for batch effects. Here we wish to correct for patient batch effect.

# run setup_anndata, use PatientID for batch
scvi$model$SCVI$setup_anndata(sub_feature_dat, batch_key = "PatientID")
## None
model_c = scvi$model$SCVI(sub_feature_dat) # create model
# train model
model_c$train()

# to specify the number of epochs when training:
# model$train(max_epochs = as.integer(400))
latent_c = model_c$get_latent_representation()
latent_c <- data.matrix(latent_c)

The scVI latent space is saved onto the anndata object

dat$obsm$X_scVI_corrected <- latent_c
# write_h5ad(dat, '../Data/Pilot_2_rule_them_ALL.h5ad')
cor_latent_UMAP <- RunUMAP(dat$obsm$X_scVI_corrected, n.components = 2, min.dist = 0.5, n.neighbors = 70)
## 20:37:02 UMAP embedding parameters a = 0.583 b = 1.334
## 20:37:02 Read 5694 rows and found 10 numeric columns
## 20:37:02 Using Annoy for neighbor search, n_neighbors = 70
## 20:37:02 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 20:37:02 Writing NN index file to temp file /scratch/18941268/Rtmpcdm90W/filee13c72b9f671
## 20:37:02 Searching Annoy index using 1 thread, search_k = 7000
## 20:37:06 Annoy recall = 100%
## 20:37:06 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 70
## 20:37:07 Initializing from normalized Laplacian + noise (using irlba)
## 20:37:07 Commencing optimization for 500 epochs, with 490478 positive edges
## 20:37:27 Optimization finished
## Warning: No assay specified, setting assay as RNA by default.
cor_latent_tSNE <- RunTSNE(dat$obsm$X_scVI_corrected, check_duplicates = TRUE, perplexity = 60, max_iter = 2000, theta = 0.2)
latent_df$cor_UMAP_1 <- cor_latent_UMAP@cell.embeddings[,1] # add UMAP coordinates
latent_df$cor_UMAP_2 <- cor_latent_UMAP@cell.embeddings[,2]

latent_df$cor_tSNE_1 <- cor_latent_tSNE@cell.embeddings[,1] # add tSNE coordinates
latent_df$cor_tSNE_2 <- cor_latent_tSNE@cell.embeddings[,2]

UMAP visualizations

We compare the dispersion of PatientID in the latent before and after batch correction:

non_corrected <- latent_df %>% 
  ggplot(aes(x = UMAP_1, y = UMAP_2, color = PatientID)) + 
  geom_point(show.legend = FALSE, alpha = 0.7) + 
  ggtitle("Non-corrected")

corrected <- latent_df %>% 
  ggplot(aes(x = cor_UMAP_1, y = cor_UMAP_2, color = PatientID)) + 
  geom_point(alpha = 0.7) + ggtitle("Patient effect corrected")
  
non_corrected + corrected 

Likewise we compare bact-corrected and non-batch corrected on celltype and viralload annotations to see if these structures are preserved and potentially have become more apparent in the latent space once patient batch effect is removed.

non_corrected <- latent_df %>% 
  ggplot(aes(x = UMAP_1, y = UMAP_2, color = cellType)) + 
  geom_point(show.legend = FALSE, alpha = 0.7) + 
  ggtitle("Non-corrected")

corrected <- latent_df %>% 
  ggplot(aes(x = cor_UMAP_1, y = cor_UMAP_2, color = cellType)) + 
  geom_point(alpha = 0.7) + 
  ggtitle("Patient effect corrected")
  
non_corrected + corrected 

non_corrected <- latent_df %>% 
  arrange(viralLoad) %>% 
  ggplot(aes(x = UMAP_1, y = UMAP_2, color = viralLoad)) + 
  geom_point(show.legend = FALSE, alpha = 0.7) + 
  ggtitle("Non-corrected")+
  scale_color_gradient(low="blue", high="red")

corrected <- latent_df %>% 
  arrange(viralLoad) %>% 
  ggplot(aes(x = cor_UMAP_1, y = cor_UMAP_2, color = viralLoad)) + 
  geom_point(alpha = 0.7) + ggtitle("Patient effect corrected") +
  scale_color_gradient(low="blue", high="red")
  
non_corrected + corrected 

tSNE visualizations

The same visualizations have been repeated with tSNE.

non_corrected <- latent_df %>% 
  ggplot(aes(x = tSNE_1, y = tSNE_2, color = PatientID)) + 
  geom_point(show.legend = FALSE, alpha = 0.7) + 
  ggtitle("Non-corrected")

corrected <- latent_df %>% 
  ggplot(aes(x = cor_tSNE_1, y = cor_tSNE_2, color = PatientID)) + 
  geom_point(alpha = 0.7) + ggtitle("Patient effect corrected")
  
non_corrected + corrected 

non_corrected <- latent_df %>% 
  ggplot(aes(x = tSNE_1, y = tSNE_2, color = cellType)) + 
  geom_point(show.legend = FALSE, alpha = 0.7) + 
  ggtitle("Non-corrected")

corrected <- latent_df %>% 
  ggplot(aes(x = cor_tSNE_1, y = cor_tSNE_2, color = cellType)) + 
  geom_point(alpha = 0.7) + 
  ggtitle("Patient effect corrected")
  
non_corrected + corrected 

non_corrected <- latent_df %>% 
  arrange(viralLoad) %>% 
  ggplot(aes(x = tSNE_1, y = tSNE_2, color = viralLoad)) + 
  geom_point(show.legend = FALSE, alpha = 0.7) + 
  ggtitle("Non-corrected")+
  scale_color_gradient(low="blue", high="red")

corrected <- latent_df %>% 
  arrange(viralLoad) %>% 
  ggplot(aes(x = cor_tSNE_1, y = cor_tSNE_2, color = viralLoad)) + 
  geom_point(alpha = 0.7) + ggtitle("Patient effect corrected") +
  scale_color_gradient(low="blue", high="red")
  
non_corrected + corrected 

Finding differentially expressed genes with scVI latent space

An example use of scVI besides dimensionality reduction is for differential expression. Many differential expression models often use linear assumptions which may not capture complex batch effects, whereas deep generative models may not suffer from these problems by using complex nonlinear mappings.

We demonstrate a simple example of computing differential expression for cell type using scVI:

DE <- model$differential_expression(sub_feature_dat, groupby = "cellType", batch_correction = FALSE)
print(DE$head())
##             proba_de  proba_not_de  ...    group1  group2
## ETNPPL        0.8260        0.1740  ...  Ciliated    Rest
## CAMK4         0.8190        0.1810  ...  Ciliated    Rest
## CHAT          0.8170        0.1830  ...  Ciliated    Rest
## AL035078.1    0.8118        0.1882  ...  Ciliated    Rest
## IGKJ3         0.8108        0.1892  ...  Ciliated    Rest
## 
## [5 rows x 22 columns]

It is also possible to use batch correction. Here we use batch corretion for patient.

DE <- model_c$differential_expression(sub_feature_dat, groupby = "cellType", batch_correction = TRUE)
print(DE$head())
##           proba_de  proba_not_de  ...    group1  group2
## TRAV12-2    0.8266        0.1734  ...  Ciliated    Rest
## FAP         0.8234        0.1766  ...  Ciliated    Rest
## PMP22       0.8176        0.1824  ...  Ciliated    Rest
## MEFV        0.8160        0.1840  ...  Ciliated    Rest
## MXD3        0.8158        0.1842  ...  Ciliated    Rest
## 
## [5 rows x 22 columns]

Session Info

sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /home/anna7393/miniconda3/envs/scVI_env/lib/libopenblasp-r0.3.21.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] anndata_0.7.5.4    lubridate_1.9.2    forcats_1.0.0      stringr_1.5.0     
##  [5] dplyr_1.1.2        purrr_1.0.1        readr_2.1.4        tidyr_1.3.0       
##  [9] tibble_3.2.1       ggplot2_3.4.2      tidyverse_2.0.0    reticulate_1.25   
## [13] SeuratObject_4.1.3 Seurat_4.3.0      
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.16             colorspace_2.1-0       deldir_1.0-9          
##   [4] ellipsis_0.3.2         ggridges_0.5.4         rprojroot_2.0.3       
##   [7] rstudioapi_0.14        spatstat.data_3.0-1    farver_2.1.1          
##  [10] leiden_0.4.3           listenv_0.9.0          ggrepel_0.9.3         
##  [13] fansi_1.0.4            codetools_0.2-19       splines_4.2.0         
##  [16] cachem_1.0.8           knitr_1.43             polyclip_1.10-4       
##  [19] jsonlite_1.8.4         ica_1.0-3              cluster_2.1.4         
##  [22] png_0.1-8              uwot_0.1.14            shiny_1.7.4           
##  [25] sctransform_0.3.5      spatstat.sparse_3.0-1  compiler_4.2.0        
##  [28] httr_1.4.6             assertthat_0.2.1       Matrix_1.5-4.1        
##  [31] fastmap_1.1.1          lazyeval_0.2.2         cli_3.6.1             
##  [34] later_1.3.1            htmltools_0.5.5        tools_4.2.0           
##  [37] igraph_1.3.5           gtable_0.3.3           glue_1.6.2            
##  [40] RANN_2.6.1             reshape2_1.4.4         Rcpp_1.0.10           
##  [43] scattermore_1.1        jquerylib_0.1.4        vctrs_0.6.2           
##  [46] nlme_3.1-162           spatstat.explore_3.2-1 progressr_0.13.0      
##  [49] lmtest_0.9-40          spatstat.random_3.1-5  xfun_0.39             
##  [52] globals_0.16.2         timechange_0.2.0       mime_0.12             
##  [55] miniUI_0.1.1.1         lifecycle_1.0.3        irlba_2.3.5.1         
##  [58] goftest_1.2-3          future_1.32.0          MASS_7.3-60           
##  [61] zoo_1.8-12             scales_1.2.1           hms_1.1.3             
##  [64] promises_1.2.0.1       spatstat.utils_3.0-3   parallel_4.2.0        
##  [67] RColorBrewer_1.1-3     yaml_2.3.7             pbapply_1.7-0         
##  [70] gridExtra_2.3          sass_0.4.6             stringi_1.7.6         
##  [73] highr_0.10             rlang_1.1.1            pkgconfig_2.0.3       
##  [76] matrixStats_0.63.0     evaluate_0.21          lattice_0.21-8        
##  [79] ROCR_1.0-11            tensor_1.5             labeling_0.4.2        
##  [82] patchwork_1.1.2        htmlwidgets_1.6.2      cowplot_1.1.1         
##  [85] tidyselect_1.2.0       here_1.0.1             parallelly_1.36.0     
##  [88] RcppAnnoy_0.0.20       plyr_1.8.8             magrittr_2.0.3        
##  [91] R6_2.5.1               generics_0.1.3         withr_2.5.0           
##  [94] pillar_1.9.0           fitdistrplus_1.1-11    survival_3.5-5        
##  [97] abind_1.4-5            sp_1.6-0               future.apply_1.11.0   
## [100] KernSmooth_2.23-21     utf8_1.2.3             spatstat.geom_3.2-1   
## [103] plotly_4.10.1          tzdb_0.4.0             rmarkdown_2.14        
## [106] grid_4.2.0             data.table_1.14.8      digest_0.6.31         
## [109] xtable_1.8-4           httpuv_1.6.11          munsell_0.5.0         
## [112] viridisLite_0.4.1      bslib_0.4.2